# Notebook helper: inject a button that toggles visibility of all code cells
# (hidden by default because the toggle runs on document ready).
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit"
value="Click here to toggle on/off the raw code."></form>''')
# Notebook helper: CSS that centers PNG plot outputs in rendered cells.
from IPython.core.display import HTML
HTML("""
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
""")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import os.path
import matplotlib.patches as mpatches
import sqlite3
import datetime
import nltk
nltk.download('stopwords', quiet=True)
import plotly.express as px
from nltk.corpus import stopwords
from IPython.display import Image
from IPython.core.display import HTML
from wordcloud import WordCloud
from sqlalchemy import create_engine
from sqlalchemy.engine.reflection import Inspector
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
plt.rcParams["figure.figsize"] = (12, 8)
from sklearn import decomposition
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean, cityblock
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.base import clone
def build_df(df, features_to_be_used, decade=None):
    """Return the filtered feature DataFrame plus song, artist and
    feature-name lists.

    Parameters
    ----------
    df : pandas DataFrame
        Original data to filter; must contain `title`, `artist` and,
        when `decade` is given, a `decade` column.
    features_to_be_used : list
        Column names to retain as features.
    decade : int, optional
        If given, keep only rows whose `decade` equals this value.

    Returns
    -------
    df : pandas DataFrame
        DataFrame restricted to `features_to_be_used` columns (and the
        `decade` filter, when given).
    songs : list
        Song titles of the retained rows.
    artists : list
        Artists of the retained rows.
    feature_names : list
        Names of the retained feature columns.
    """
    if decade is not None:
        df = df[df['decade'] == decade]
    songs = list(df.loc[:, 'title'])
    artists = list(df.loc[:, 'artist'])
    df = df.loc[:, features_to_be_used]
    feature_names = list(df.columns)
    return df, songs, artists, feature_names
def pooled_within_ssd(X, y, centroids, dist):
    """Compute the pooled within-cluster sum of squares around the
    cluster mean.

    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point.
    y : array
        Cluster label of each point; labels index into `centroids`.
    centroids : array
        Cluster centroids.
    dist : callable
        Distance between two points. It should accept two arrays, each
        corresponding to the coordinates of one point.

    Returns
    -------
    float
        Pooled within-cluster sum of squares around the cluster mean.
    """
    y = np.asarray(y)
    ssd = 0.0
    for cluster_group in np.unique(y):
        # Indices of the members of this cluster; the original code
        # recomputed the cluster size and the distance for every point
        # of X on every cluster (accidental O(n^2 * k)).
        members = np.flatnonzero(y == cluster_group)
        cluster_size = len(members)
        for j in members:
            in_cluster_dist = dist(X[j], centroids[cluster_group])
            ssd += in_cluster_dist**2 / (2 * cluster_size)
    return ssd
def gap_statistic(X, y, centroids, dist, b, clusterer, random_state=None):
    """Compute the gap statistic.

    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point.
    y : array
        Cluster label of each point.
    centroids : array
        Cluster centroids.
    dist : callable
        Distance between two points. It should accept two arrays, each
        corresponding to the coordinates of one point.
    b : int
        Number of realizations for the reference distribution.
    clusterer : KMeans
        Clusterer object that will be used for clustering the reference
        realizations (cloned before use, so the argument is not mutated).
    random_state : int, default=None
        Determines random number generation for realizations.

    Returns
    -------
    gs : float
        Gap statistic (mean log-ratio of reference to actual pooled SSD).
    gs_std : float
        Standard deviation of the gap statistic over the b realizations.
    """
    rng = np.random.default_rng(random_state)
    # Pooled within-cluster SSD of the actual clustering.
    W_k = pooled_within_ssd(X, y, centroids, dist)
    gap_s = []  # removed the unused `W_ki_s` accumulator from the original
    clusterer = clone(clusterer)
    for _ in range(b):
        # Reference realization: uniform over the bounding box of X.
        X_ref = rng.uniform(low=X.min(axis=0), high=X.max(axis=0),
                            size=X.shape)
        y_ref = clusterer.fit_predict(X_ref)
        W_ref = pooled_within_ssd(X_ref, y_ref,
                                  clusterer.cluster_centers_, dist)
        gap_s.append(np.log(W_ref) - np.log(W_k))
    return np.mean(gap_s), np.std(gap_s)
def cluster_range(X, clusterer, k_start, k_stop, actual=None):
    """Cluster `X` for every k in [k_start, k_stop] and collect labels
    plus internal (and optionally external) validation values.

    Parameters
    ----------
    X : pandas DataFrame
        Design matrix with each row corresponding to a point.
    clusterer : KMeans
        Clusterer object cloned and refit for each k.
    k_start : int
        First number of clusters to try.
    k_stop : int
        Last number of clusters to try (inclusive).
    actual : array, optional
        Ground-truth labels; when given, external validation values
        (purity, AMI, ARI) are also computed.

    Returns
    -------
    dict
        Keys 'ys', 'centers', 'inertias', 'chs', 'scs', 'gss', 'gssds'
        and, when `actual` is given, 'ps', 'amis', 'ars'; each maps to a
        list with one entry per k.
    """

    def _purity(truth, pred):
        """Fraction of points in the majority true class of their cluster.

        The original code called an undefined name `purity`, which would
        raise NameError whenever `actual` was supplied.
        """
        truth = np.asarray(truth)
        pred = np.asarray(pred)
        total = 0
        for c in np.unique(pred):
            _, counts = np.unique(truth[pred == c], return_counts=True)
            total += counts.max()
        return total / len(pred)

    X = X.to_numpy()
    ys = []
    centers = []
    inertias = []
    chs = []
    scs = []
    gss = []
    gssds = []
    ps = []
    amis = []
    ars = []
    for k in range(k_start, k_stop+1):
        clusterer_k = clone(clusterer)
        clusterer_k.set_params(n_clusters=k)
        y = clusterer_k.fit_predict(X)
        ys.append(y)
        centers.append(clusterer_k.cluster_centers_)
        inertias.append(clusterer_k.inertia_)
        chs.append(calinski_harabasz_score(X, y))
        scs.append(silhouette_score(X, y))
        gs = gap_statistic(X, y, clusterer_k.cluster_centers_,
                           euclidean, 5,
                           clone(clusterer).set_params(n_clusters=k),
                           random_state=1337)
        gss.append(gs[0])
        gssds.append(gs[1])
        if actual is not None:
            # BUG FIX: these metrics were referenced without being
            # imported anywhere in the file; imported locally so the
            # external-validation path actually works.
            from sklearn.metrics import (adjusted_mutual_info_score,
                                         adjusted_rand_score)
            ps.append(_purity(actual, y))
            amis.append(adjusted_mutual_info_score(actual, y))
            ars.append(adjusted_rand_score(actual, y))
    cluster_range = {'ys': ys, 'centers': centers, 'inertias': inertias,
                     'chs': chs, 'scs': scs, 'gss': gss, 'gssds': gssds}
    if actual is not None:
        cluster_range['ps'] = ps
        cluster_range['amis'] = amis
        cluster_range['ars'] = ars
    return cluster_range
def k_means_test(df, features, clusterer=None, decade=None, k=2, pc=7,
                 plot=False, k_start=1, k_end=1, actual=None):
    """
    Return cluster labels and standardized design matrix.

    Parameters
    ----------
    df : pandas DataFrame
        df for processing
    features : list
        list of feature column names
    clusterer : KMeans, optional
        Clusterer passed to `cluster_range` for the internal-validation
        chart; only used when `plot=True`
    decade : int, optional
        `Year` filter passed to `build_df`
    k : int, optional
        number of clusters to be formed
    pc : int, optional
        number of principal components to be used for clustering
    plot : boolean, optional
        Indicator whether an axes plot is to be shown or not
    k_start : int, optional
        Initial value of cluster steps (only used when `plot=True`)
    k_end : int, optional
        Final value of cluster steps (only used when `plot=True`)
    actual : array, optional
        List of ground-truth labels, forwarded to `cluster_range`

    Returns
    -------
    y_predict : array
        array of predicted clusters from KMeans on the first `pc` PCs
    X_std : numpy design matrix
        numpy matrix of standardized features (before PCA)
    """
    # filter df for analysis
    df_filtered, songs, artists, feature_names = build_df(df,
                                                          features,
                                                          decade=decade)
    # Standardize input pandas DataFrame (z-score per column)
    X = df_filtered.to_numpy()
    X_std = (X - X.mean(axis=0))/X.std(axis=0)
    pca = decomposition.PCA()
    X_std_PCA = pca.fit_transform(X_std)
    p = pca.components_.T
    # Cluster on the first `pc` principal components only
    kmeans_df = KMeans(n_clusters=k, random_state=1337)
    y_predict = kmeans_df.fit_predict(X_std_PCA[: , :pc])
    y_centers = kmeans_df.cluster_centers_
    spotify_color = (sns.color_palette('Greens')[1::2][::-1]
                     + sns.color_palette('binary')[0:4])
    X_std_PCA_df = pd.DataFrame(X_std_PCA)
    X_std_PCA_df['y_predict'] = y_predict
    if plot == True:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4), dpi=100)
        k_clusters = str(k) + ' clusters'
        # Left axes: PC1-PC2 scatter colored by cluster, centroids in red
        sns.scatterplot(data=X_std_PCA_df, x=0, y=1, hue='y_predict',
                        palette=spotify_color[:k], ax=ax1)
        sns.scatterplot(data=pd.DataFrame(y_centers), x=0, y=1, color='red',
                        ax=ax1)
        for cluster, vec in zip(range(k), y_centers):
            cluster_label = 'Cluster ' + str(cluster) + ' centroid'
            ax1.text(vec[0], vec[1]+0.1, cluster_label, ha='center',
                     color='black', fontsize=10)
        ax1.set_xlabel('PC1')
        ax1.set_ylabel('PC2')
        ax1.set_title('Scatter Plot with Clustering')
        # Right axes: internal validation metrics across k_start..k_end
        res_decade = cluster_range(df_filtered, clusterer, k_start, k_end,
                                   actual)
        # NOTE(review): tick positions assume k_start == 2 — confirm.
        ks = np.arange(2, len(res_decade['inertias'])+2)
        ax2.plot(ks, res_decade['inertias'], '-o', color=spotify_color[1],
                 label='SSE')
        ax2.plot(ks, res_decade['chs'], '-o', color=spotify_color[3],
                 label='CH')
        ax2.set_xlabel('$k$')
        ax2.set_ylabel('SSE/CH')
        lines, labels = ax2.get_legend_handles_labels()
        # Secondary y-axis for the metrics on a much smaller scale
        ax2a = ax2.twinx()
        ax2a.errorbar(ks, res_decade['gss'], res_decade['gssds'], fmt='-o',
                      color=spotify_color[6], label='Gap statistic')
        ax2a.plot(ks, res_decade['scs'], '-o', color=spotify_color[0],
                  label='Silhouette coefficient')
        ax2a.set_ylabel('Gap statistic/Silhouette')
        lines2, labels2 = ax2a.get_legend_handles_labels()
        ax2a.legend(lines+lines2, labels+labels2)
        ax2.set_title('Internal Validation')
        plt.tight_layout()
        plt.show()
    return y_predict, X_std
def boxplot(df, decade):
    """Plot a 3x3 grid of boxplots of the audio features per cluster.

    Parameters
    ----------
    df : array
        Standardized design matrix with one column per entry of the
        module-level `features` list.
    decade : int
        Decade label; kept for the callers' signatures but not used in
        the plot itself (the original dead `order = decade` assignment
        was removed).

    Notes
    -----
    Relies on the module-level globals `features` (column names) and
    `y_predict` (cluster labels from the most recent `k_means_test`
    call), so it must run after `k_means_test`.
    """
    df = pd.DataFrame(df, columns=features)
    df['cluster'] = y_predict
    spotify_color = (sns.color_palette('Greens')[1::2][::-1]
                     + sns.color_palette('binary')[0:4])
    fig, ax = plt.subplots(3, 3, figsize=(6, 6), dpi=100)
    features_for_plot = [['loudness', 'danceability', 'energy'],
                         ['acousticness', 'valence', 'tempo'],
                         ['instrumentalness', 'speechiness', 'liveness']]
    for i in range(3):
        for j in range(3):
            sns.boxplot(data=df, x='cluster', y=features_for_plot[i][j],
                        palette=spotify_color, linewidth=1, ax=ax[i][j])
            ax[i][j].set(title=features_for_plot[i][j], xlabel='cluster',
                         ylabel='')
    plt.tight_layout()
    plt.show()
def wordcloud_clstr(df, cluster, show='song'):
    """Return a WordCloud of column `show` for rows of `df` whose
    `cluster_label` equals `cluster`.

    Parameters
    ----------
    df : pandas DataFrame
        Data containing a `cluster_label` column and the `show` column.
    cluster : str
        Cluster label to filter on.
    show : str, optional
        Column whose values populate the wordcloud ('song' or 'artist').

    Returns
    -------
    WordCloud
        Generated wordcloud object ready for `imshow`.
    """
    from wordcloud import WordCloud, STOPWORDS
    comment_words = ''
    add_words = ['remastered', 'remaster', 'version', 'single',
                 'mono', 'stereo', 'feat']
    stopwords = set(list(STOPWORDS) + add_words)
    # BUG FIX: previously filtered the global `df_all` and ignored the
    # `df` argument entirely.
    for val in df.loc[df['cluster_label'] == cluster][show]:
        # typecast each val to string, split, and lowercase the tokens
        tokens = [token.lower() for token in str(val).split()]
        comment_words += " ".join(tokens) + " "
    return WordCloud(width=800, height=800,
                     background_color='white',
                     stopwords=stopwords,
                     min_font_size=20).generate(comment_words)
# Shared palette: every-other dark-to-light greens plus four greys.
spotify_color = (sns.color_palette('Greens')[1::2][::-1]
                 + sns.color_palette('binary')[0:4])
# Audio features used as the clustering design matrix.
features = ['danceability', 'energy', 'key', 'loudness', 'mode',
            'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo']
len_features = len(features)
# Decades analyzed (50=1950s ... 0=2000s, 10=2010s), with the chosen
# cluster count and number of PCs used for each decade (same order).
decade_list = [50, 60, 70, 80, 90, 0, 10]
clusters_per_decade = [3, 2, 2, 2, 3, 3, 4]
pcs_per_decade = [7, 7, 7, 7, 7, 7, 7]
Image(filename="header.png")
| Feature | Type | Description |
|---|---|---|
| id | str | Unique identifier for Spotify tracks |
| artist | str | Artist per track |
| title | str | Title per track |
| release_date | dt | Date when the track was released |
| popularity | int | Spotify's internal popularity ranking system from 0 to 100 |
| decade | int | Decade when the track was released |
| danceability | float | Dance suitability of track |
| energy | float | Perceptual measure of intensity and activity |
| key | int | Pitch or scale of the track |
| loudness | float | Loudness of track in decibels |
| mode | int | Modality (major or minor) of a track |
| speechiness | float | Presence of spoken words in a track |
| acousticness | float | Confidence measure from 0.0 to 1.0 of whether the track is acoustic |
| instrumentalness | float | Prediction of whether a track contains no vocals; values closer to 1.0 indicate instrumental tracks |
| liveness | float | Prediction of audience presence in a track |
| valence | float | Musical positiveness of a track |
| tempo | float | Estimated tempo in beats per minute |
# Load the Spotify tracks dataset and preview the raw rows
df_csv = pd.read_csv('dmw_spotify.csv')
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Table 3.2 Sample Dataframe</b></center>'))
display(df_csv.head())
# Summary statistics for the numeric columns
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Table 3.3 Feature Description</b></center>'))
display(df_csv.describe())
Image(filename="method.PNG")
# Rebuild the shared green/grey palette and register it with seaborn
spotify_color = (sns.color_palette('Greens')[1::2][::-1]
                 + sns.color_palette('binary')[0:4])
sns.color_palette(spotify_color)
def plot_boxplot_eda(df_csv, col1, col2, col3):
    """Plot side-by-side boxplots of three features across the decades.

    Draws one boxplot per feature (`col1`, `col2`, `col3`), with decades
    ordered chronologically on the x-axis.
    """
    spotify_color = (sns.color_palette('Greens')[1::2][::-1]
                     + sns.color_palette('binary')[0:4])
    decade_order = [50, 60, 70, 80, 90, 0, 10]
    fig, axes = plt.subplots(1, 3, figsize=(10, 3), dpi=100)
    for axis, column in zip(axes, (col1, col2, col3)):
        sns.boxplot(data=df_csv, x='decade', y=column,
                    palette=spotify_color, linewidth=1, ax=axis,
                    order=decade_order)
        axis.set(title=column.title(), xlabel='Decade', ylabel='')
    plt.show()
# EDA: per-decade distribution of each audio feature, three at a time
plot_boxplot_eda(df_csv, 'loudness', 'danceability', 'energy')
plot_boxplot_eda(df_csv, 'acousticness', 'valence', 'tempo')
plot_boxplot_eda(df_csv, 'instrumentalness', 'speechiness', 'liveness')
def plot_pca_and_var_exp(df, features, decade=None, c1=0, c2=1, t=0.8):
    """Plot the variance-explained curve and a PC scatter with loadings.

    Parameters
    ----------
    df : pandas DataFrame
        df for processing
    features : list
        list of features for loadings plot
    decade : int, optional
        `Year` filter passed to `build_df`
    c1 : int, optional
        index no. 1 of the principal component to be plotted
    c2 : int, optional
        index no. 2 of the principal component to be plotted
    t : float, optional
        cumulative-variance threshold used to count principal components

    Returns
    ----------
    pc_no_threshold : int
        number of PCs needed to reach cumulative variance `t`
    """
    # filter df for analysis
    df_filtered, songs, artists, feature_names = build_df(df,
                                                          features,
                                                          decade=decade)
    # Standardize input pandas DataFrame (z-score per column)
    X = df_filtered.to_numpy()
    X_std = (X - X.mean(axis=0))/X.std(axis=0)
    pca = decomposition.PCA()
    X_std_PCA = pca.fit_transform(X_std)
    # Initialize subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4), dpi=100)
    # First axes: individual and cumulative variance explained per PC
    # NOTE(review): positional x/y args to sns.lineplot/scatterplot are
    # removed in seaborn >= 0.12 — confirm the pinned seaborn version.
    pca_var = pca.explained_variance_ratio_
    sns.lineplot(range(1, len(pca_var) + 1), pca_var*100,
                 label='individual', marker="o", color=spotify_color[6],
                 alpha=0.75, ax=ax1)
    sns.lineplot(range(1, len(pca_var) + 1), pca_var.cumsum()*100,
                 label='cumulative', marker="o", color=spotify_color[0],
                 alpha=0.75, ax=ax1)
    ax1.legend(fontsize=8)
    ax1.set_ylim(0, 105)
    ax1.set_xlabel('PC')
    ax1.set_ylabel('variance explained (%)')
    # Count PCs below the threshold, plus one to cross it
    pc_no_threshold = np.sum(np.where(pca_var.cumsum() < t, 1, 0)) + 1
    # Second axes: scatter plot of observations against PC1 and PC2
    # Identify the max and min values of PC1 and PC2 and corresponding indexes
    max_PC1 = np.amax(X_std_PCA[:, c1])
    max_PC1_idx = np.argmax(X_std_PCA[:, c1])
    max_PC1_PC2 = X_std_PCA[:, c2][max_PC1_idx]
    min_PC1 = np.amin(X_std_PCA[:, c1])
    min_PC1_idx = np.argmin(X_std_PCA[:, c1])
    min_PC1_PC2 = X_std_PCA[:, c2][min_PC1_idx]
    max_PC2 = np.amax(X_std_PCA[:, c2])
    max_PC2_idx = np.argmax(X_std_PCA[:, c2])
    max_PC2_PC1 = X_std_PCA[:, c1][max_PC2_idx]
    min_PC2 = np.amin(X_std_PCA[:, c2])
    min_PC2_idx = np.argmin(X_std_PCA[:, c2])
    min_PC2_PC1 = X_std_PCA[:, c1][min_PC2_idx]
    for_highlight = np.array([max_PC1_idx, max_PC2_idx,
                              min_PC1_idx, min_PC2_idx])
    # Loadings of the two selected components, side by side
    pcas = np.append(pca.components_.T[:, c1:c1+1],
                     pca.components_.T[:, c2:c2+1], axis=1)
    # Rank features by loading magnitude; keep the 20 largest
    weights = np.linalg.norm(pcas, axis=1)
    indexes = weights.argsort()[-20:]
    # Scatter plot for all the songs and corresponding PC1 & PC2
    sns.scatterplot(X_std_PCA[:, c1], X_std_PCA[:, c2], alpha=0.75,
                    color=spotify_color[5], ax=ax2)
    # Highlight and label songs with extreme PC1 and PC2 values
    ax2.scatter(X_std_PCA[for_highlight[:, None], c1],
                X_std_PCA[for_highlight, c2],
                alpha=0.8, color='black')
    ax2.text(max_PC1-0.5, max_PC1_PC2, songs[max_PC1_idx],
             ha='center', color='black', fontsize=8)
    ax2.text(max_PC2_PC1, max_PC2-0.5, songs[max_PC2_idx],
             ha='center', color='black', fontsize=8)
    ax2.text(min_PC1+0.5, min_PC1_PC2, songs[min_PC1_idx],
             ha='center', color='black', fontsize=8)
    # The 1990s outlier title is very long; truncate it to 30 chars
    if decade == 90:
        ax2.text(min_PC2_PC1, min_PC2+0.5, songs[min_PC2_idx][:30],
                 ha='center', color='black', fontsize=8)
    else:
        ax2.text(min_PC2_PC1, min_PC2+0.5, songs[min_PC2_idx],
                 ha='center', color='black', fontsize=8)
    # Loadings plot (arrows) scaled to roughly span the scatter
    X_std_PCA_df = pd.DataFrame(X_std_PCA)
    feature_names = np.array(feature_names)
    mult = 1.1*int(np.abs(max_PC1))
    for feature, vec in zip(feature_names[indexes], pcas[indexes]):
        ax2.arrow(0, 0, mult*vec[0], mult*vec[1], width=0.05, head_width=0.1,
                  ec='none', fc=spotify_color[0])
        ax2.text((mult+1)*vec[0], (mult+1)*vec[1], feature, ha='center',
                 color='black', fontsize=6, alpha=1)
    ax2.set_xlabel('PC1')
    ax2.set_ylabel('PC2')
    plt.tight_layout()
    plt.show()
    return pc_no_threshold
# PCA variance & loadings plots: 1950s
pcs1950 = plot_pca_and_var_exp(df_csv, features, decade=50, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.2 Individual Variance Cumulative Variance of PCs '
             'for 1950 (left) <br>Figure 5.3 Scatter Plot of Songs to PC1 '
             'and PC2 with Loadings Plot for songs in 1950 (right)'
             '</b></center>'))
# PCA variance & loadings plots: 1960s
pcs1960 = plot_pca_and_var_exp(df_csv, features, decade=60, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.4 Individual Variance Cumulative Variance of PCs '
             'for 1960 (left) <br>Figure 5.5 Scatter Plot of Songs to PC1 '
             'and PC2 with Loadings Plot for songs in 1960 (right)'
             '</b></center>'))
# PCA variance & loadings plots: 1970s
pcs1970 = plot_pca_and_var_exp(df_csv, features, decade=70, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.6 Individual Variance Cumulative Variance of PCs '
             'for 1970 (left) <br>Figure 5.7 Scatter Plot of Songs to PC1 '
             'and PC2 with Loadings Plot for songs in 1970 (right)'
             '</b></center>'))
# PCA variance & loadings plots: 1980s
pcs1980 = plot_pca_and_var_exp(df_csv, features, decade=80, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.8 Individual Variance Cumulative Variance of PCs '
             'for 1980 (left) <br>Figure 5.9 Scatter Plot of Songs to PC1 '
             'and PC2 with Loadings Plot for songs in 1980 (right)'
             '</b></center>'))
# PCA variance & loadings plots: 1990s
pcs1990 = plot_pca_and_var_exp(df_csv, features, decade=90, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.10 Individual Variance Cumulative Variance of PCs '
             'for 1990 (left) <br>Figure 5.11 Scatter Plot of Songs to '
             'PC1 and PC2 with Loadings Plot for songs in 1990 (right)'
             '</b></center>'))
# PCA variance & loadings plots: 2000s (decade code 0)
pcs2000 = plot_pca_and_var_exp(df_csv, features, decade=0, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.12 Individual Variance Cumulative Variance of PCs '
             'for 2000 (left) <br>Figure 5.13 Scatter Plot of Songs to PC1 '
             'and PC2 with Loadings Plot for songs in 2000 (right)'
             '</b></center>'))
# PCA variance & loadings plots: 2010s (decade code 10)
pcs2010 = plot_pca_and_var_exp(df_csv, features, decade=10, c1=0, c2=1, t=0.8)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.14 Individual Variance Cumulative Variance of PCs '
             'for 2010 (left) <br>Figure 5.15 Scatter Plot of Songs to PC1 '
             'and PC2 with Loadings Plot for songs in 2010 (right)'
             '</b></center>'))
def pca_weights(df, features, decade=None, n_features=11):
    """Plot horizontal bar charts of the top feature loadings of PC1/PC2.

    Parameters
    ----------
    df : pandas DataFrame
        df for processing
    features : list
        list of feature names considered for the bar charts
    decade : int, optional
        `Year` filter passed to `build_df`
    n_features : int, optional
        number of features shown on each bar chart
    """
    # filter df for analysis
    df_filtered, songs, artists, feature_names = build_df(df,
                                                          features,
                                                          decade=decade)
    # Standardize input pandas DataFrame (z-score per column)
    X = df_filtered.to_numpy()
    X_std = (X - X.mean(axis=0))/X.std(axis=0)
    pca = decomposition.PCA()
    # Only the loadings are needed, so fit without transforming
    # (the original kept an unused transformed matrix).
    pca.fit(X_std)
    p = pca.components_.T
    fig, axes = plt.subplots(1, 2, figsize=(8, 2.5), dpi=100)
    for PC, axis in enumerate(axes.flatten()):
        # Sort features by absolute loading; keep the top n_features
        order = np.argsort(np.abs(p[:, PC]))[-1*n_features:]
        axis.barh([feature_names[o] for o in order], p[order, PC],
                  color='forestgreen')
        axis.tick_params(labelsize=8)
        axis.set_title(f'PC{PC+1}', size=10)
        axis.set_xlim(-0.65, 0.65)
        for side in ('top', 'bottom', 'left', 'right'):
            axis.spines[side].set_visible(False)
        axis.tick_params(which='both', width=0)
    plt.show()
# Bar charts of the top PC1/PC2 feature loadings, one pair per decade
pca_weights(df_csv, features, decade=50, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.16 Principal Components 1 and 2 with Top Features '
             ' for songs in 1950</b></center>'))
pca_weights(df_csv, features, decade=60, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.17 Principal Components 1 and 2 with Top Features '
             ' for songs in 1960</b></center>'))
pca_weights(df_csv, features, decade=70, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.18 Principal Components 1 and 2 with Top Features '
             ' for songs in 1970</b></center>'))
pca_weights(df_csv, features, decade=80, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.19 Principal Components 1 and 2 with Top Features '
             ' for songs in the 1980s </b> </center>'))
pca_weights(df_csv, features, decade=90, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.20 Principal Components 1 and 2 with Top Features '
             ' for songs in the 1990s</b></center>'))
pca_weights(df_csv, features, decade=0, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.21 Principal Components 1 and 2 with Top Features '
             ' for songs in the 2000s</b></center>'))
pca_weights(df_csv, features, decade=10, n_features=len_features)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.22 Principal Components 1 and 2 with Top Features '
             ' for songs in the 2010s</b></center>'))
# Clustering + internal validation, then per-cluster feature boxplots,
# for each decade. Caption fixes: Figure 5.32 was mislabeled "5.30", and
# the Figure 5.33 caption said "1990s" for the decade=0 (2000s) run.
# 1950s
y_predict, X_std = k_means_test(df_csv, features, decade=50, k=3,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.23 Cluster Scatter Plot and Internal '
             'Validation Chart for the 1950s </b></center>'))
boxplot(X_std, 50)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.24 Audio Features per Cluster in the 1950s '
             '</b></center>'))
# 1960s
y_predict, X_std = k_means_test(df_csv, features, decade=60, k=2,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.25 Cluster Scatter Plot and Internal '
             'Validation Chart in the 1960s</b></center>'))
boxplot(X_std, 60)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.26 Audio Features per Cluster in the 1960s'
             '</b></center>'))
# 1970s
y_predict, X_std = k_means_test(df_csv, features, decade=70, k=2,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.27 Cluster Scatter Plot and Internal '
             'Validation Chart in the 1970s</b></center>'))
boxplot(X_std, 70)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.28 Audio Features per Cluster in the 1970s'
             '</b></center>'))
# 1980s
y_predict, X_std = k_means_test(df_csv, features, decade=80, k=2,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.29 Cluster Scatter Plot and Internal '
             'Validation Chart in the 1980s </b></center>'))
boxplot(X_std, 80)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.30 Audio Features per Cluster in the 1980s'
             '</b></center>'))
# 1990s
y_predict, X_std = k_means_test(df_csv, features, decade=90, k=3,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.31 Cluster Scatter Plot and Internal '
             'Validation Chart in the 1990s </b></center>'))
boxplot(X_std, 90)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.32 Audio Features per Cluster in the 1990s'
             '</b></center>'))
# 2000s (decade code 0)
y_predict, X_std = k_means_test(df_csv, features, decade=0, k=3,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.33 Cluster Scatter Plot and Internal '
             'Validation Chart in the 2000s </b></center>'))
boxplot(X_std, 0)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.34 Audio Features per Cluster in the 2000s'
             '</b></center>'))
# 2010s (decade code 10)
y_predict, X_std = k_means_test(df_csv, features, decade=10, k=4,
                                pc=7, plot=True,
                                clusterer=KMeans(random_state=1337),
                                k_start=2, k_end=11, actual=None)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.35 Cluster Scatter Plot and Internal '
             'Validation Chart in the 2010s</b></center>'))
boxplot(X_std, 10)
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.36 Audio Features per Cluster in the 2010s'
             '</b></center>'))
| Cluster Label | Matching Cluster Groups | General Description | Defining Features |
|---|---|---|---|
| Easy Pop/Rock | A | Happy, upbeat, easy listening songs | High: Energy, Valence, Tempo Low: Danceability, Acousticness |
| Senti | B | Sentimental, mellow songs | High: Acousticness Low: Loudness, Danceability, Energy, Valence |
| Move | C | Happy, energetic dance songs | High: Loudness, Danceability, Energy, Valence Low: Acousticness |
| Chill | D | Sentimental, mellow dance songs | High: Danceability, Acousticness Low: Loudness, Energy, Valence, Tempo |
| Alternative Pop | E | Sentimental, upbeat and energetic songs | High: Loudness, Energy, Tempo Low: Danceability, Acousticness, Valence |
# Prepare dataframe with a cluster_label column for all decades
df_all = pd.DataFrame()
# Mapping of cluster index -> human-readable cluster label, per decade
dec50map = {0: 'Easy Pop/Rock', 1: 'Senti', 2: 'Move'}
dec60map = {0: 'Easy Pop/Rock', 1: 'Senti'}
dec70map = {0: 'Move', 1: 'Senti'}
dec80map = {0: 'Move', 1: 'Chill'}
dec90map = {0: 'Senti', 1: 'Easy Pop/Rock', 2: 'Move'}
dec00map = {0: 'Move', 1: 'Senti', 2: 'Alternative/Pop'}
dec10map = {0: 'Outlier', 1: 'Chill', 2: 'Move', 3: 'Alternative/Pop'}
mapping = [dec50map, dec60map, dec70map, dec80map, dec90map,
           dec00map, dec10map]
# Iterate number of PCs, clusters and label per cluster per decade
for decade, cluster_count, pc_count, map_val in zip(decade_list,
                                                    clusters_per_decade,
                                                    pcs_per_decade,
                                                    mapping):
    # Build the dataframe according to necessary filters
    df_filtered, songs, artists, feature_names = build_df(df_csv,
                                                          features,
                                                          decade=decade)
    # Get cluster assignments for this decade (no plots)
    y_kmeans, X_std = k_means_test(df_csv, features, decade=decade,
                                   k=cluster_count,
                                   pc=pc_count,
                                   plot=False)
    # Build DataFrame with additional relevant features
    df_with_cluster_kmeans = pd.concat([df_filtered.reset_index(),
                                        pd.DataFrame(songs,
                                                     columns=['song']),
                                        pd.DataFrame(artists,
                                                     columns=['artist']),
                                        pd.DataFrame(y_kmeans,
                                                     columns=['cluster'])],
                                       axis=1)
    df_with_cluster_kmeans.drop(columns='index', inplace=True)
    df_with_cluster_kmeans['decade'] = decade
    df_with_cluster_kmeans['cluster_label'] = (df_with_cluster_kmeans
                                               ['cluster'].map(map_val))
    # Append for all decades; DataFrame.append was removed in pandas 2.0,
    # so accumulate with pd.concat instead.
    df_all = pd.concat([df_all, df_with_cluster_kmeans])
# Plot wordclouds of song titles and artists, one figure per cluster
# label, in the same order as the original five copy-pasted sections.
for cluster_label in ['Easy Pop/Rock', 'Senti', 'Move', 'Chill',
                      'Alternative/Pop']:
    fig = plt.figure(figsize=(15, 15))
    title_plot = ['Songs', 'Artists']
    for i, show_ in enumerate(['song', 'artist']):
        ax = fig.add_subplot(1, 2, i+1)
        wordcloud = wordcloud_clstr(df_all, cluster_label, show=show_)
        ax.imshow(wordcloud)
        ax.set_title(title_plot[i], fontsize=20)
        ax.axis('off')
# Plot distribution of clusters per decade as a stacked bar chart
spotify_color_stacked = (sns.color_palette('Greens')[1::2][::-1] +
                         sns.color_palette('binary')[:4:3] +
                         sns.color_palette('binary')[-1::])
# Set theme
sns.set_style("white")
sns.set_palette(spotify_color_stacked)
# Count songs per (decade, cluster label); missing combinations become 0
summary = (df_all.groupby(['decade', 'cluster_label'])
           ['song'].count().unstack().fillna(0))
# Order decades chronologically (1950s..1990s, then 2000s and 2010s)
custom_dict = {50: 0, 60: 1, 70: 2, 80: 3, 90: 4, 0: 5, 10: 6}
summary['rank'] = summary.index.map(custom_dict)
summary.sort_values(by=['rank'], inplace=True)
# now drop the helper 'rank' column
summary.drop(labels=['rank'], axis=1, inplace=True)
# Fix the stacking order of the cluster labels
summary = summary[['Move', 'Easy Pop/Rock', 'Outlier',
                   'Alternative/Pop', 'Chill', 'Senti']]
summary.plot(kind='bar', stacked=True, figsize=(8, 6))
# Single xticks call (the original set ticks and rotation separately)
plt.xticks(ticks=range(7), labels=[1950, 1960, 1970, 1980, 1990, 2000, 2010],
           rotation=0)
plt.xlabel('Decade')
plt.ylabel('Number of Songs')
# The original first legend call (loc='best') was dead code — it was
# immediately replaced by this one; keep only the effective call.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=12)
plt.title('Distribution of Identified Music Clusters Across the Decades')
plt.show()
display(HTML('<center style="font-size:14px;font-style:default;">'
             '<b>Figure 5.43 Cluster Distribution Across the'
             ' Decades</b></center>'))